---------------------------------------------------------------------------------------------------------------------------------
      name:  <unnamed>
       log:  /afs/econ.duke.edu/data/apm16/Dropout/y97/RawData/AFQT_MATCHING_VERB/AFQT_MATCHING_VERB_with_weights.log
  log type:  text
 opened on:  25 Nov 2013, 16:24:18

. 
. /***************************************************
> MATCHING AFQT SCORES ACROSS NLSY 1979 and NLSY 1997
> 
> This do file creates a data file with comparable AFQT scores across both NLSY79 and NLSY97.
> There are two main steps in creating the comparable AFQT scores:
> 
> 1. The 1979 ASVAB is a Paper and Pencil (P&P) test, while the 1997 ASVAB was computer administered. 
> To make the scores comparable across cohorts, we rely on a percentile mapping provided by Dan Segall (Segall (1997))
> 
> 2. The age at which respondents took the test differs between 1979 and 1997. The 1997 sample is much younger.
> For both samples, we observe a large sample of individuals taking the test at age 16. We use this overlap in the 
> test-taking age by mapping all test scores within cohorts into the age 16-distribution based on the within age 
> ranking of test scores. 
> 
> For details, see Segall (1997) and Altonji, Bharadwaj & Lange (2009).
> 
> Altonji, J., Bharadwaj, P. & Lange, F. "Changes in the Characteristics of American Youth - 
> Implications for Adult Outcomes" NBER Working Papers No. 13883, revised 2009.
> Segall, D. O. (1997). "Equating the CAT-ASVAB". In W. A. Sands, B. K. Waters, & J. R. McBride (Eds.), 
>         Computerized adaptive testing: From inquiry to operation (pp. 181-198). Washington, DC: American Psychological Associat
> ion. 
> Date: August 19, 2009.
> ******************************************************/
. 
. // Set path to directory containing afqt1997a.csv //
. capture cd "/afs/econ.duke.edu/data/apm16/Dropout/y97/RawData/AFQT_MATCHING_VERB"

. * cd "/Users/JKukkur/Documents/Research/ABL/AFQT MATCHING"
. 
. tempfile afqt97 afqt_append nlsy_agestd agestd_afqt missings finished_product

. 
. /****************************************************************
> First Step of score conversion: 
> Transform CAT Test Scores from NLSY97 into Paper and Pencil Test Scores using mapping provided by Dan Segall.
> Combine this data with raw data from NLSY79.
> *****************************************************************/
. 
. 
. // afqt1997a.csv contains individual IDs and sex from NLSY1997 and the ASVAB component scores provided by Dan Segall. 
. // These are P&P-equivalent scores based on the mapping procedure described in Segall (1996). 
. // Dan Segall supplied us with these P&P-equivalent scores using ASVAB component scores contained in (DICTIONARY FILE).
. insheet using afqt1997a.csv, comma
(36 vars, 8984 obs)

. ren v1 pid

. ren v2 male

. egen asvabVerb=rowmean(pc wk) if wk!=0 & pc!=0 // pc=paragraph comprehension, wk=word knowledge 
(1881 missing values generated)

. keep pid male asvabVerb

. sort pid

. save `afqt97', replace
(note: file /tmp/St15170.00001t not found)
file /tmp/St15170.00001t saved

. 
. // Merge age in for the NLSY97 sample //
. infile using age97.dct, clear

infile dictionary {
  R0000100 "PUBID - YTH ID CODE 1997"
  R1194100 "CV_AGE_INT_DATE 1997"
  R2553500 "CV_AGE_INT_DATE 1998"
  R3876300 "CV_AGE_INT_DATE 1999"
  R5453700 "CV_AGE_INT_DATE 2000"
  R7216000 "CV_AGE_INT_DATE 2001"
  S1531400 "CV_AGE_INT_DATE 2002"
  S2001000 "CV_AGE_INT_DATE 2003"
  S3801100 "CV_AGE_INT_DATE 2004"
}

(8984 observations read)

. ren R0000100    pid

. ren R1194100    age  // age as of 1997 (test-taking year for NLSY97) //

. keep pid age

. sort pid

. merge pid using `afqt97' 
(note: you are using old merge syntax; see [D] merge for new syntax)

. drop _merge 

. sort pid

. save `afqt97', replace
file /tmp/St15170.00001t saved

. 
. // Merge in weights for 1997 data //
. // We use the custom weight provided by the NLSY for the year 1997, the year when the ASVAB was administered. //
. insheet using weights97.csv, clear
(2 vars, 8984 obs)

. sort pid

. merge pid using `afqt97'
(note: you are using old merge syntax; see [D] merge for new syntax)
pid was int now float

. drop _merge 

. gen sample=1                            // Sample Identifier: 1= 1997 NLSY sample, 0=1979 NLSY sample //

. sort sample pid

. save `afqt97', replace
file /tmp/St15170.00001t saved

. 
. // NLSY 1979 Sample: Age Information and AFQT-scores //
. infile using nlsy79_vars.dct, clear

infile dictionary {
  R0000100 "ID# (1-12686) 79"
  R0000500 "DATE OF BIRTH - YR 79"
  R0173600 "SAMPLE ID  79 INT"
  R0406510 "AGE OF R @ INT DATE 80"
  R0618011 "PROFILES ASVAB SEC 2-STD SCRNR 81"
  R0618012 "PROFILES ASVAB SEC 3-STD SCRNR 81"
  R0618013 "PROFILES ASVAB SEC 4-STD SCRNR 81"
  R0618014 "PROFILES ASVAB SEC 5-STD SCRNR 81"
}

(12686 observations read)

. ren R0000100 pid                

. ren R0406510 age                                                                        // Age as of 1980 (test taking year for
>  NLSY79) //

. ren R0000500 birthyear 

. ren R0173600 sampid

. ren R0618011 ar

. ren R0618012 wk

. ren R0618013 pc

. ren R0618014 no

. qui replace ar=. if ar<0

. qui replace wk=. if wk<0

. qui replace pc=. if pc<0

. qui replace no=. if no<0

. egen asvabVerb = rowmean(pc wk) if wk!=0 & pc!=0
(808 missing values generated)

. drop if asvabVerb==.
(808 observations deleted)

. label var asvabVerb "Math subset of ASVAB score"

. replace age=80-birthyear if age<0 & birthyear!=.        // Fill missing age using birth-year // 
(202 real changes made)

. drop birthyear

. gen sample=0                                                    // Sample Identifier: 1= 1997 NLSY sample, 0=1979 NLSY sample /
> /

. append using `afqt97'                           // Append the data-set for NLSY97 //

. sort sample pid

. save `afqt_append', replace
(note: file /tmp/St15170.00001u not found)
file /tmp/St15170.00001u saved

. 
. * Merge in 1979 weights
. // We use the custom weight provided by the NLSY for the year 1979, the year when the ASVAB was administered. //
. 
. insheet using weights79.csv, clear
(2 vars, 12686 obs)

. gen sample=0

. sort sample pid

. merge sample pid using `afqt_append'
(note: you are using old merge syntax; see [D] merge for new syntax)
pid was int now float

. drop _merge

. 
. * The weights have implied 2 decimal places.
. replace weight=weight/100
weight was long now double
(21670 real changes made)

. sort sample pid

. save `afqt_append', replace
file /tmp/St15170.00001u saved

. 
. /***************************************************************************
> Second Step: Percentile mapping of P&P test scores into age=16 distribution
> ****************************************************************************/
. 
. *****           GENERATING PERCENTILES OF SCORES BY AGE AND NLSY-SAMPLE *********
. use `afqt_append', clear

. drop ar wk pc no

. 
. // Drop those with missing AFQT scores
. drop if asvabVerb==.
(2689 observations deleted)

. 
. // For Table I in Introduction.doc
. bysort sample: tab age

---------------------------------------------------------------------------------------------------------------------------------
-> sample = 0

 AGE OF R @ |
INT DATE 80 |      Freq.     Percent        Cum.
------------+-----------------------------------
         15 |        962        8.10        8.10
         16 |      1,511       12.72       20.82
         17 |      1,488       12.53       33.35
         18 |      1,432       12.06       45.40
         19 |      1,502       12.65       58.05
         20 |      1,558       13.12       71.17
         21 |      1,539       12.96       84.12
         22 |      1,529       12.87       96.99
         23 |        357        3.01      100.00
------------+-----------------------------------
      Total |     11,878      100.00

---------------------------------------------------------------------------------------------------------------------------------
-> sample = 1

 AGE OF R @ |
INT DATE 80 |      Freq.     Percent        Cum.
------------+-----------------------------------
         12 |        964       13.57       13.57
         13 |      1,409       19.84       33.41
         14 |      1,480       20.84       54.24
         15 |      1,492       21.01       75.25
         16 |      1,325       18.65       93.90
         17 |        430        6.05       99.96
         18 |          3        0.04      100.00
------------+-----------------------------------
      Total |      7,103      100.00


. 
. 
. // For Figure 1 in NLSY79
. kdensity asvabVerb if sample==0&age==16, addplot(kdensity asvabVerb if sample==1&age==16, lpattern(_)) title(Figure 1: AFQT Sco
> res at Age 16) ///
> note("The NLSY79-scores are the P&P scores reported by the NLSY79." "The NLSY-97 scores are based on the CAT scores from NLSY97
>  and the equation by Segal (1997)." "Both populations are weighted to be population representative.") ///
> legend(label(1 "NLSY 1979") label(2 "NLSY 1997") cols(2)) saving(HistScores, replace)
(file HistScores.gph saved)

. 
. 
. // Combine sparsely populated age-groups in both samples with adjacent age-groups //
. replace age=22 if age==23 & sample==0
(357 real changes made)

. replace age=17 if age==18 & sample==1
(3 real changes made)

. 
. *******************************************************************
. *EXPANDING THE DATA AND CREATING PERCENTILES BY HAND
. // The following procedure improves the quality of the percentile mapping. 
. // The problem is that some observations 'belong' in several percentiles because they 
. // have large weights. Stata commands such as xtile will simply assign a unique percentile to these
. // observations. Instead, we need to account for the fact that these observations belong to several pctiles. 
. // This is achieved by expanding the data-set proportionally to the weights and then generating percentiles. 
. 
. *expanding each observation by its weight - an observation with a weight of 1100 is expanded into 11 observations. 
. gen percentile_rank=.
(18981 missing values generated)

. replace weight=round(weight/100)
(18981 real changes made)

. foreach num of numlist 0/160 {
  2.         qui expand `num' if weight==`num'
  3. }

. 
. *generating a unique rank within each sample and age
. bysort sample age: egen r=rank(asvabVerb), u

. gen pasvabVerb=.
(471020 missing values generated)

. 
. * Divide the rank by number of individuals corresponding to the population of a given age and sample
. * to get the percentile of an individual.
. 
. gen holdey = 1/100

. bysort sample age: egen sumWgts=total(holdey)

. 
. *SAMPLE==0
. qui replace pasvabVerb=round(r/sumWgts) if age==15 & sample==0

. qui replace pasvabVerb=round(r/sumWgts) if age==16 & sample==0

. qui replace pasvabVerb=round(r/sumWgts) if age==17 & sample==0

. qui replace pasvabVerb=round(r/sumWgts) if age==18 & sample==0

. qui replace pasvabVerb=round(r/sumWgts) if age==19 & sample==0

. qui replace pasvabVerb=round(r/sumWgts) if age==20 & sample==0

. qui replace pasvabVerb=round(r/sumWgts) if age==21 & sample==0

. qui replace pasvabVerb=round(r/sumWgts) if age==22 & sample==0

. 
. *SAMPLE 1
. qui replace pasvabVerb=round(r/sumWgts) if age==12 & sample==1

. qui replace pasvabVerb=round(r/sumWgts) if age==13 & sample==1

. qui replace pasvabVerb=round(r/sumWgts) if age==14 & sample==1

. qui replace pasvabVerb=round(r/sumWgts) if age==15 & sample==1

. qui replace pasvabVerb=round(r/sumWgts) if age==16 & sample==1

. qui replace pasvabVerb=round(r/sumWgts) if age==17 & sample==1

. 
. *dropping the duplicates now
. egen tag=tag(sample pid)

. keep if tag==1
(452039 observations deleted)

. drop tag

. 
. sort sample pasvabVerb

. save `nlsy_agestd', replace
(note: file /tmp/St15170.00001v not found)
file /tmp/St15170.00001v saved

. 
. *****************************************************************************
. 
. // Within each sample, we map AFQT scores by age into the age=16
. // distribution. We therefore require mean AFQT-scores of age=16 by percentile in each sample. 
. // We need to generate these averages using the weights.
. bys sample pasvabVerb: egen pop=sum(weight) if age==16
(16145 missing values generated)

. bys sample pasvabVerb: egen tot_score=sum(asvabVerb*weight) if age==16  // Mean age=16 raw score for each percentile //
(16145 missing values generated)

. bys sample pasvabVerb: gen mean=tot_score/pop if age==16
(16145 missing values generated)

. drop tot_score pop

. egen tag=tag(sample pasvabVerb mean) if age==16                         // We need only 1 obs per sample and percentile //

. keep if tag==1
(18779 observations deleted)

. ren mean asvabVerb_std

. keep sample asvabVerb_std pasvabVerb asvabVerb

. sort sample pasvabVerb

. save `agestd_afqt', replace
(note: file /tmp/St15170.00001w not found)
file /tmp/St15170.00001w saved

. 
. use `nlsy_agestd', clear

. merge sample pasvabVerb using `agestd_afqt' // Merge into each percentile the age=16 corresponding score //
(note: you are using old merge syntax; see [D] merge for new syntax)
variables sample pasvabVerb do not uniquely identify observations in the master data

. drop _merge

. keep sample pid male pasvabVerb asvabVerb_std asvabVerb weight age

. sort sample pid

. // The final data contains "asvabVerb_std", which is the comparable score across the 2 NLSY samples //
. *keep if sample==1 // this drops the NLSY79 observations //
. ren pid ID

. ren weight weight_altonji

. gen year = 1979 if sample==0
(7103 missing values generated)

. replace year = 1997 if sample==1 
(7103 real changes made)

. *drop sample age asvabVerb pasvabVerb // drop the other variables since I don't use them in my research //
. preserve

.         use `afqt97', clear

.         keep pid male

.         ren pid ID

.         save `missings', replace
(note: file /tmp/St15170.00001x not found)
file /tmp/St15170.00001x saved

. restore

. 
. preserve

.         keep if sample==1
(11878 observations deleted)

.         merge 1:1 ID using `missings'

    Result                           # of obs.
    -----------------------------------------
    not matched                         1,881
        from master                         0  (_merge==1)
        from using                      1,881  (_merge==2)

    matched                             7,103  (_merge==3)
    -----------------------------------------

.         tab _merge

                 _merge |      Freq.     Percent        Cum.
------------------------+-----------------------------------
         using only (2) |      1,881       20.94       20.94
            matched (3) |      7,103       79.06      100.00
------------------------+-----------------------------------
                  Total |      8,984      100.00

.         zscore asvabVerb_std
z_asvabVerb_std created with 1881 missing values

.         drop asvabVerb_std

.         ren z_asvabVerb_std asvabVerb_std

.         keep if male==1
(4385 observations deleted)

.         sort ID

.         outsheet ID asvabVerb_std using altonjiAFQT.csv, comma nol replace

. restore

. save afqt_adjusted_final, replace 
file afqt_adjusted_final.dta saved

. save `finished_product', replace
(note: file /tmp/St15170.000020 not found)
file /tmp/St15170.000020 saved

. 
. tabstat asvabVerb_std, by(sample) stats(n min max mean median)

Summary for variables: asvabVerb_std
     by categories of: sample 

  sample |         N       min       max      mean       p50
---------+--------------------------------------------------
       0 |     11878  21.28502      61.5  44.38163  45.48201
       1 |      7103        20      61.5  45.41536  47.72314
---------+--------------------------------------------------
   Total |     18981        20      61.5  44.76847  46.40996
------------------------------------------------------------

. 
. isid ID year

. 
. * Test: the afqt distributions across age within sample should now be identical. There will still be very small deviations
. * because of the coarseness of the above expansion, but we believe these to be third order. To allow this code to run 
. * rapidly, we tolerate these deviations. 
. 
. **bys sample age: sum asvabVerb_std [fw=weight], d
. 
. use `finished_product', clear

. ** Generate an NLSY79-only supplement:
. drop if sample==1
(7103 observations deleted)

. save afqt_adjusted_final79, replace
file afqt_adjusted_final79.dta saved

. 
. use `finished_product', clear

. ** Generate an NLSY97-only supplement:
. drop if sample==0
(11878 observations deleted)

. save afqt_adjusted_final97, replace
file afqt_adjusted_final97.dta saved

. 
. log close
      name:  <unnamed>
       log:  /afs/econ.duke.edu/data/apm16/Dropout/y97/RawData/AFQT_MATCHING_VERB/AFQT_MATCHING_VERB_with_weights.log
  log type:  text
 closed on:  25 Nov 2013, 16:24:28
---------------------------------------------------------------------------------------------------------------------------------
